A car insurance claim is a process wherein an insured person asks the car insurance company to compensate him/her for the damages sustained by his/her car after an accident. At times, it is also when the insured asks the insurance company to represent him/her or intervene on his/her behalf when the insured is responsible for unintentional damage caused to a third party. Since one pays a car insurance premium, it is one's right to make a claim when he/she is involved in an accident.
We are required to build a model that will predict whether the claims that were registered are fraudulent or not, using the given data set.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression,Ridge,Lasso
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from statsmodels.formula.api import ols
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.decomposition import PCA
import xgboost as xgb
from IPython.display import Image
from sklearn.model_selection import train_test_split
from IPython.display import Image
import plotly.express as px
import plotly.graph_objects as go
import os
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
# Widen pandas display limits so wide DataFrames render fully in the notebook.
pd.set_option('display.max_row', 100000)
pd.set_option('display.max_columns', 500000)
os.getcwd()
# Load the raw claims data.
dt_insurance = pd.read_csv('insurance_claims.csv')
dt_insurance.head()
dt_insurance.columns
dt_insurance.shape
# Compute the correlation matrix once (the original recomputed .corr()
# a second time for the heatmap). Selecting numeric columns explicitly
# keeps this working on pandas >= 2.0, where DataFrame.corr() raises on
# non-numeric columns; older pandas silently dropped them.
corr_matrix = dt_insurance.select_dtypes('number').corr()
corr_matrix
plt.figure(figsize=(20, 15))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
# '?' marks missing collision info in the raw data; label it explicitly.
# Assigning the result back (instead of inplace=True on a column) avoids
# pandas chained-assignment warnings.
dt_insurance['collision_type'] = dt_insurance['collision_type'].replace({'?': 'None'})
# Keep only claims flagged as fraud: every chart below is a breakdown of
# the fraud_reported == 'Y' population.
dt_insurance = dt_insurance[dt_insurance['fraud_reported'] == 'Y']
# Percentage share of each policy_state among fraud-reported claims.
df_polstate = dt_insurance.groupby('fraud_reported')['policy_state'].value_counts(normalize=True)
df_polstate = df_polstate.mul(100).rename('Percent').reset_index()
df_polstate['Percent'] = df_polstate['Percent'].round(decimals=2)
df_polstate.head()
px.bar(df_polstate, x='fraud_reported', y='Percent', color='policy_state',
       title="Fraud reported as yes w.r.t State", barmode='group', text='Percent')
def _fraud_pct(frame, column):
    """Return the % share of each `column` value within each fraud_reported group."""
    pct = frame.groupby('fraud_reported')[column].value_counts(normalize=True)
    pct = pct.mul(100).rename('Percent').reset_index()
    pct['Percent'] = pct['Percent'].round(decimals=2)
    return pct


def _fraud_pct_bar(frame, column, title):
    """Grouped percentage bar chart of `column` vs fraud_reported."""
    return px.bar(frame, x='fraud_reported', y='Percent', color=column,
                  title=title, barmode='group', text='Percent')


# The six cells below were identical copy-paste; each now reuses the
# helpers. The per-column DataFrames keep their original names.
df_sex = _fraud_pct(dt_insurance, 'insured_sex')
df_sex.head(10)
_fraud_pct_bar(df_sex, 'insured_sex', "Fraud reported as yes w.r.t Insured sex")

df_inci = _fraud_pct(dt_insurance, 'incident_type')
df_inci.head(10)
# Title typo ("Incedent") fixed.
_fraud_pct_bar(df_inci, 'incident_type', "Fraud reported as yes w.r.t Incident type")

df_colli = _fraud_pct(dt_insurance, 'collision_type')
df_colli.head(10)
# Title typo ("Collison") fixed.
_fraud_pct_bar(df_colli, 'collision_type', "Fraud reported as yes w.r.t Collision type")

df_edu = _fraud_pct(dt_insurance, 'insured_education_level')
df_edu.head(10)
_fraud_pct_bar(df_edu, 'insured_education_level', "Fraud reported as yes w.r.t insured education")

df_hobby = _fraud_pct(dt_insurance, 'insured_hobbies')
df_hobby.head(10)
_fraud_pct_bar(df_hobby, 'insured_hobbies', "Fraud reported as yes w.r.t insured hobbies")

df_city = _fraud_pct(dt_insurance, 'incident_city')
df_city.head(10)
_fraud_pct_bar(df_city, 'incident_city', "Fraud reported as yes w.r.t Incident City")
dt_insurance.dtypes
# Parse the date columns. pd.to_datetime infers the layout directly; the
# original pre-cast via .astype('datetime64[ns]') plus format='%y%m%d'
# was redundant (the format argument only applies to string input, and
# the cast had already produced datetimes).
dt_insurance['incident_date'] = pd.to_datetime(dt_insurance['incident_date'])
dt_insurance['policy_bind_date'] = pd.to_datetime(dt_insurance['policy_bind_date'])
# Derive month/year features via the .dt accessor. The column names keep
# the original "incedent" spelling so any downstream references stay valid.
dt_insurance['incedent_month'] = dt_insurance['incident_date'].dt.month
dt_insurance['incedent_year'] = dt_insurance['incident_date'].dt.year
# Raw date columns are no longer needed once the parts are extracted.
dt_insurance.drop(['policy_bind_date', 'incident_date'], axis=1, inplace=True)
# NOTE(review): re-reading the CSV here discards all of the cleaning,
# filtering, and date-feature engineering done above — presumably a
# notebook-restart artifact; confirm this is intentional.
dt_insurance=pd.read_csv('insurance_claims.csv')
dt_insurance.property_damage.unique()
# One-hot encode property_damage; drop_first=True drops the first level
# (presumably '?', since the 'NO' dummy is used later — verify), keeping
# the remaining indicator columns.
xyz=pd.get_dummies(dt_insurance['property_damage'],drop_first=True)
dt_insurance=pd.concat([dt_insurance,xyz],axis=1)
dt_insurance.drop('property_damage',axis=1,inplace=True)
dt_insurance.head()
# Frequency-encode police_report_available: map each category to the
# number of times it occurs in the column.
insured_map=dt_insurance['police_report_available'].value_counts().to_dict()
insured_map
dt_insurance['police_report_available']=dt_insurance['police_report_available'].map(insured_map)
dt_insurance.head()
# Split into features and target.
X=dt_insurance.drop('fraud_reported',axis=1)
y=dt_insurance['fraud_reported']
X.head()
y.head()
from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.feature_selection import chi2
# Univariate feature selection: rank features by ANOVA F-statistic
# against the class label and keep the top 20.
# NOTE(review): f_classif requires purely numeric features, but X still
# appears to contain object columns (e.g. insured_hobbies) — there may be
# encoding cells missing from this view; confirm.
ordered_rank_features=SelectKBest(f_classif,k=20)
ordered_feature=ordered_rank_features.fit(X,dt_insurance['fraud_reported'])
# Pair each column name with its F-score for inspection.
dfscores=pd.DataFrame(ordered_feature.scores_,columns=["Score"])
dfcolumns=pd.DataFrame(X.columns)
features_rank=pd.concat([dfcolumns,dfscores],axis=1)
features_rank.columns=['Features','Score']
features_rank
# Top 20 features by score.
features_rank.nlargest(20,'Score')
# Keep the 20 best-scoring features (plus the target) for modelling.
# .copy() decouples this frame from dt_insurance.
dt_ins_ver = dt_insurance[["Minor Damage", "Total Loss", "vehicle_claim",
                           "total_claim_amount", "insured_hobbies", "property_claim",
                           "Trivial Damage", "Vehicle Theft",
                           "Parked Car", "Rear Collision", "injury_claim", "NO",
                           "Single Vehicle Collision",
                           "umbrella_limit",
                           "number_of_vehicles_involved", "witnesses",
                           "incident_state", "Front Collision", "auto_model",
                           "bodily_injuries", "fraud_reported"]].copy()
dt_ins_ver.head()
# Visualise the class imbalance of the target.
plt.figure(figsize=(8, 8))
# Pass the column via the x= keyword: the positional form was removed in
# seaborn 0.12 and raises a TypeError there.
sns.countplot(x='fraud_reported', data=dt_ins_ver)
plt.show()
dt_ins_ver.fraud_reported.value_counts()
# Feature matrix / target vector for resampling and model fitting.
x = dt_ins_ver.drop('fraud_reported', axis=1)
y = dt_ins_ver['fraud_reported']
x.shape, y.shape
# SMOTETomek balances the classes by oversampling the minority class
# (SMOTE) and then removing overlapping majority samples (Tomek links).
smk = SMOTETomek(random_state=42)
# fit_resample replaces fit_sample, which was deprecated in
# imbalanced-learn 0.4 and removed in 0.6.
X_res, y_res = smk.fit_resample(x, y)
# 70/30 train/test split on the resampled data.
x_train, x_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42)
x_train.shape, y_train.shape, x_test.shape, y_test.shape
# Baseline model: plain logistic regression on the balanced split.
log_reg = LogisticRegression()
log_pred = log_reg.fit(x_train, y_train).predict(x_test)
# Confusion matrix rendered as an annotated heatmap.
cm1 = confusion_matrix(y_test, log_pred)
sns.heatmap(cm1, annot=True, fmt='d')
# Overall accuracy plus per-class precision/recall/F1.
print(accuracy_score(y_test, log_pred))
print(classification_report(y_test, log_pred))
# Random forest ensemble on the same train/test split.
model_rand = RandomForestClassifier()
model_rand_test = model_rand.fit(x_train, y_train).predict(x_test)
# Confusion matrix rendered as an annotated heatmap.
cm1 = confusion_matrix(y_test, model_rand_test)
sns.heatmap(cm1, annot=True, fmt='d')
# Accuracy plus the full per-class report.
print('Accuracy Score:', accuracy_score(y_test, model_rand_test))
print(classification_report(y_test, model_rand_test))
Random Forest is giving better accuracy than Logistic Regression with 20 parameters.
- For Random Forest, accuracy decreased when I took 25 variables for model building.
- For Random Forest, accuracy remained almost the same when I took 15 or 20 variables.
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest.
model_params = {
    'n_estimators': [50, 150, 250],                   # forest size
    'max_features': ['sqrt', 0.25, 0.5, 0.75, 1.0],   # features considered per split
    'min_samples_split': [2, 4, 6],                   # min samples to split a node
}
rf_model = RandomForestClassifier(random_state=1)
# Exhaustive 5-fold cross-validated search over all grid combinations.
clf = GridSearchCV(rf_model, model_params, cv=5)
model = clf.fit(x_train, y_train)
# Evaluate the best estimator found by the search.
grid_predict = model.predict(x_test)
cm1 = confusion_matrix(y_test, grid_predict)
sns.heatmap(cm1, annot=True, fmt='d')
print(accuracy_score(y_test, grid_predict))
print(classification_report(y_test, grid_predict))
- For Random Forest, accuracy increased a bit when I applied hyperparameter tuning.
- After applying hyperparameter tuning, I got the best accuracy of 88% for this dataset.
- After applying the K-best method, I can conclude that the following variables can be used to detect a fraudulent claim with 88% accuracy:
- incident_severity, vehicle_claim, total_claim_amount, property_claim, incident_type, umbrella_limit, auto_model, incident_state, insured_hobbies, collision_type, bodily_injuries